En este análisis, el equipo Omega explora datos de tiendas OXXO
para detectar patrones de tiendas exitosas a partir de nuevas variables.
¡Empecemos!
library(dplyr)
##
## Adjuntando el paquete: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr)
library(caret)
## Warning: package 'caret' was built under R version 4.4.1
## Cargando paquete requerido: ggplot2
## Cargando paquete requerido: lattice
library(xgboost)
## Warning: package 'xgboost' was built under R version 4.4.1
##
## Adjuntando el paquete: 'xgboost'
## The following object is masked from 'package:dplyr':
##
## slice
library(lightgbm)
## Warning: package 'lightgbm' was built under R version 4.4.3
library(randomForest)
## Warning: package 'randomForest' was built under R version 4.4.1
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
##
## Adjuntando el paquete: 'randomForest'
## The following object is masked from 'package:ggplot2':
##
## margin
## The following object is masked from 'package:dplyr':
##
## combine
library(rpart)
library(nnet)
library(ggplot2)
library(reshape2)
## Warning: package 'reshape2' was built under R version 4.4.3
##
## Adjuntando el paquete: 'reshape2'
## The following object is masked from 'package:tidyr':
##
## smiths
library(e1071)
library(ggcorrplot)
## Warning: package 'ggcorrplot' was built under R version 4.4.3
library(skimr)
## Warning: package 'skimr' was built under R version 4.4.3
library(DataExplorer)
## Warning: package 'DataExplorer' was built under R version 4.4.1
library(naniar)
## Warning: package 'naniar' was built under R version 4.4.3
##
## Adjuntando el paquete: 'naniar'
## The following object is masked from 'package:skimr':
##
## n_complete
library(janitor)
## Warning: package 'janitor' was built under R version 4.4.3
##
## Adjuntando el paquete: 'janitor'
## The following objects are masked from 'package:stats':
##
## chisq.test, fisher.test
library(GGally)
## Warning: package 'GGally' was built under R version 4.4.3
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
library(themis)
## Warning: package 'themis' was built under R version 4.4.3
## Cargando paquete requerido: recipes
## Warning: package 'recipes' was built under R version 4.4.1
##
## Adjuntando el paquete: 'recipes'
## The following object is masked from 'package:stats':
##
## step
library(MLmetrics)
## Warning: package 'MLmetrics' was built under R version 4.4.3
##
## Adjuntando el paquete: 'MLmetrics'
## The following objects are masked from 'package:caret':
##
## MAE, RMSE
## The following object is masked from 'package:base':
##
## Recall
library(sf)
## Warning: package 'sf' was built under R version 4.4.3
## Linking to GEOS 3.13.0, GDAL 3.10.1, PROJ 9.5.1; sf_use_s2() is TRUE
library(gridExtra)
## Warning: package 'gridExtra' was built under R version 4.4.1
##
## Adjuntando el paquete: 'gridExtra'
## The following object is masked from 'package:randomForest':
##
## combine
## The following object is masked from 'package:dplyr':
##
## combine
library(broom)
library(themis)
library(recipes)
library(xgboost)
library(caret)
library(MLmetrics)
library(doParallel)
## Warning: package 'doParallel' was built under R version 4.4.3
## Cargando paquete requerido: foreach
## Warning: package 'foreach' was built under R version 4.4.1
## Cargando paquete requerido: iterators
## Warning: package 'iterators' was built under R version 4.4.1
## Cargando paquete requerido: parallel
library(caret)
library(caretEnsemble)
## Warning: package 'caretEnsemble' was built under R version 4.4.3
library(doParallel)
# Read the store dataset and report per-variable missingness,
# sorted from most to least missing; print every row of the tibble.
df <- read.csv("oxxo_tiendas_ext.csv", stringsAsFactors = FALSE)
miss_tbl <- arrange(miss_var_summary(df), desc(pct_miss))
print(miss_tbl, n = nrow(miss_tbl))
## # A tibble: 23 × 3
## variable n_miss pct_miss
## <chr> <int> <num>
## 1 tienda_id 0 0
## 2 plaza_cve 0 0
## 3 nivelsocioeconomico_des 0 0
## 4 entorno_des 0 0
## 5 mts2ventas_num 0 0
## 6 puertasrefrig_num 0 0
## 7 cajonesestacionamiento_num 0 0
## 8 segmento_maestro_desc 0 0
## 9 lid_ubicacion_tienda 0 0
## 10 dataset 0 0
## 11 meta_venta 0 0
## 12 porcentaje_cumplimiento 0 0
## 13 venta_promedio 0 0
## 14 dist_cerca 0 0
## 15 latitud_num 0 0
## 16 longitud_num 0 0
## 17 num_escuelas 0 0
## 18 num_abarrotes 0 0
## 19 num_super 0 0
## 20 num_farmacia 0 0
## 21 num_oficina 0 0
## 22 num_establecimientos 0 0
## 23 num_gasolinera 0 0
# One histogram per numeric column of df.
# Fixes: aes_string() is deprecated (ggplot2 >= 3.0) — use the .data
# pronoun instead; right-assignment `->` replaced by `<-`; type-stable
# vapply() instead of sapply().
num_cols <- names(df)[vapply(df, is.numeric, logical(1))]
for (col in num_cols) {
  p <- ggplot(df, aes(x = .data[[col]])) +
    geom_histogram(bins = 30) +
    theme_minimal() +
    labs(title = paste("Histograma de", col))
  print(p)  # explicit print needed inside a for loop
}
# One boxplot per numeric column, displayed in rows of up to 4 panels.
# Fixes: deprecated aes_string() replaced with the .data pronoun
# (aes_string(x = "1") parsed "1" to the numeric constant 1, so x = 1
# preserves the original single-group behavior); vapply() over sapply().
num_cols <- names(df)[vapply(df, is.numeric, logical(1))]
plots <- lapply(num_cols, function(col) {
  ggplot(df, aes(x = 1, y = .data[[col]])) +
    geom_boxplot() +
    theme_minimal() +
    theme(axis.title.x = element_blank(),
          axis.text.x = element_blank(),
          axis.ticks.x = element_blank()) +
    labs(title = paste("Boxplot de", col))
})
for (i in seq(1, length(plots), by = 4)) {
  grid.arrange(grobs = plots[i:min(i + 3, length(plots))], ncol = 4)
}
# Load libraries
library(dplyr)
library(geosphere)
# Read data
df <- read.csv("oxxo_tiendas_ext.csv", stringsAsFactors = FALSE)
sev <- read.csv("7eleven_mty_tamps.csv", stringsAsFactors = FALSE)
# 1. Keep rows with valid coordinates
df_geo <- df %>% filter(!is.na(latitud_num), !is.na(longitud_num))
sev_geo <- sev %>% filter(!if_all(everything(), is.na))
# 2. Detect the latitude/longitude column names in sev dynamically
lat_col <- grep("lat", names(sev_geo), ignore.case = TRUE, value = TRUE)[1]
lng_col <- grep("lon|lng|long", names(sev_geo), ignore.case = TRUE, value = TRUE)[1]
# 3. Destination matrix of 7-Eleven coordinates in (lng, lat) order, the
#    order distHaversine() expects. Built ONCE here — the original rebuilt
#    it with as.matrix() inside the per-row closure.
dest_mat <- as.matrix(sev_geo %>% select(lng = all_of(lng_col),
                                         lat = all_of(lat_col)))
# 4. dist_se: distance (m) from each OXXO to its nearest 7-Eleven.
#    vapply() over row indices replaces apply() on a data.frame, which
#    coerces the frame to a matrix; distHaversine() is vectorised over
#    the destination rows, so only the minimum is taken per store.
df_geo$dist_se <- vapply(seq_len(nrow(df_geo)), function(i) {
  origin <- c(df_geo$longitud_num[i], df_geo$latitud_num[i])
  round(min(distHaversine(origin, dest_mat), na.rm = TRUE), 2)
}, numeric(1))
# Attach dist_se back onto the full dataset by store id.
df <- df %>%
  left_join(df_geo %>% select(tienda_id, dist_se), by = "tienda_id")
library(dplyr)
library(ggcorrplot)
# Build df2_trans: lowercase all column names, convert character columns
# to factors, add log_dist_cerca = log1p(dist_cerca), and drop the id and
# raw coordinate columns (dist_se is kept untransformed).
df2_trans <- df %>%
  rename_all(tolower) %>%
  mutate(
    across(where(is.character), as.factor),
    log_dist_cerca = log1p(dist_cerca)
  ) %>%
  select(-tienda_id, -latitud_num, -longitud_num)
# Fit an ANOVA of average sales on store descriptors.
# NOTE(review): aov() uses sequential (Type I) sums of squares, so each
# term's F-test depends on the order terms are listed below.
fit_trans <- aov(
  venta_promedio ~
    nivelsocioeconomico_des +
    entorno_des +
    segmento_maestro_desc +
    lid_ubicacion_tienda +
    plaza_cve +
    mts2ventas_num +
    puertasrefrig_num +
    cajonesestacionamiento_num +
    porcentaje_cumplimiento +
    log_dist_cerca + # log-transformed distance
    dist_se + # untransformed distance to nearest 7-Eleven
    num_escuelas +
    num_abarrotes +
    num_super +
    num_farmacia +
    num_oficina +
    num_establecimientos +
    num_gasolinera,
  data = df2_trans
)
# Classic ANOVA summary table
summary(fit_trans)
## Df Sum Sq Mean Sq F value Pr(>F)
## nivelsocioeconomico_des 6 1.457e+13 2.429e+12 26.480 < 2e-16 ***
## entorno_des 3 1.876e+12 6.253e+11 6.818 0.000150 ***
## segmento_maestro_desc 4 9.264e+11 2.316e+11 2.525 0.039424 *
## lid_ubicacion_tienda 4 1.776e+12 4.441e+11 4.842 0.000716 ***
## plaza_cve 1 3.638e+11 3.638e+11 3.966 0.046682 *
## mts2ventas_num 1 6.037e+11 6.037e+11 6.582 0.010444 *
## puertasrefrig_num 1 6.391e+10 6.391e+10 0.697 0.404063
## cajonesestacionamiento_num 1 1.048e+10 1.048e+10 0.114 0.735456
## porcentaje_cumplimiento 1 4.380e+13 4.380e+13 477.526 < 2e-16 ***
## log_dist_cerca 1 7.261e+11 7.261e+11 7.916 0.004994 **
## dist_se 1 1.153e+11 1.153e+11 1.257 0.262510
## num_escuelas 1 2.558e+10 2.558e+10 0.279 0.597533
## num_abarrotes 1 6.795e+11 6.795e+11 7.409 0.006602 **
## num_super 1 3.703e+09 3.703e+09 0.040 0.840794
## num_farmacia 1 3.345e+10 3.345e+10 0.365 0.546032
## num_oficina 1 1.974e+11 1.974e+11 2.152 0.142666
## num_establecimientos 1 2.397e+09 2.397e+09 0.026 0.871609
## num_gasolinera 1 2.330e+10 2.330e+10 0.254 0.614390
## Residuals 1022 9.374e+13 9.172e+10
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Correlation matrix of the numeric predictors (includes log_dist_cerca
# and dist_se); the response venta_promedio is excluded.
is_num <- vapply(df2_trans, is.numeric, logical(1))
numeric_vars <- df2_trans[is_num]
numeric_vars$venta_promedio <- NULL
M <- cor(numeric_vars, use = "pairwise.complete.obs")
ggcorrplot(M, lab = TRUE)
# Binary target: a store counts as successful when it exceeds 91% goal
# attainment. NOTE(review): 91 is a hand-picked cutoff — confirm it
# matches the business definition of "successful store".
df <- df %>%
  mutate(exitoso = as.integer(porcentaje_cumplimiento > 91))
# Class balance: absolute counts, then percentages.
tab <- table(df$exitoso)
cat("Conteo:\n"); print(tab)
## Conteo:
##
## 0 1
## 263 791
cat("\n%:\n"); print(round(prop.table(tab)*100,2))
##
## %:
##
## 0 1
## 24.95 75.05
# Persist the labelled dataset for later chunks.
write.csv(df, "df_exit.csv", row.names=FALSE)
# 1) Load libraries
library(dplyr)
library(leaflet)
# 2) Read data
df <- read.csv("oxxo_tiendas_ext.csv", stringsAsFactors = FALSE)
# 3) Keep only rows with valid coordinates
df_map <- filter(df, !is.na(latitud_num), !is.na(longitud_num))
# 4) Build the map: OpenStreetMap base tiles plus one red circle per
#    store, centred on the mean coordinate of the plotted points.
center_lng <- mean(df_map$longitud_num, na.rm = TRUE)
center_lat <- mean(df_map$latitud_num, na.rm = TRUE)
m <- leaflet(df_map)
m <- addTiles(m)
m <- addCircleMarkers(
  m,
  lng = ~longitud_num,
  lat = ~latitud_num,
  radius = 4,
  color = "red",
  stroke = FALSE,
  fillOpacity = 0.6,
  popup = ~paste0(
    "<strong>Tienda:</strong> ", tienda_id, "<br/>",
    "<strong>% Cumplimiento:</strong> ", porcentaje_cumplimiento, "%"
  )
)
setView(m, lng = center_lng, lat = center_lat, zoom = 7)
library(dplyr)
library(leaflet)
# 1) Read the 7-Eleven dataset (adjust path/filename as needed)
df7e <- read.csv("7eleven_mty_tamps.csv", stringsAsFactors = FALSE)
# 2) Keep only rows with valid coordinates
df7e <- df7e %>%
  filter(!is.na(lat), !is.na(lng))
# 3) Map with green markers, popup showing name/address/searched CP,
#    centred on the mean coordinate of the plotted stores.
leaflet(df7e) %>%
  addTiles() %>%
  addCircleMarkers(
    lng = ~lng,
    lat = ~lat,
    radius = 5,
    color = "darkgreen",
    fillColor = "green",
    fillOpacity = 0.7,
    stroke = TRUE,
    popup = ~paste0(
      "<strong>", name, "</strong><br/>",
      address, "<br/>",
      "CP buscado: ", postal_code_searched
    )
  ) %>%
  setView(
    lng = mean(df7e$lng, na.rm = TRUE),
    lat = mean(df7e$lat, na.rm = TRUE),
    zoom = 7
  )
# Load libraries
library(dplyr)
library(tidyr)
library(geosphere)
library(caret)
library(recipes)
library(themis)
library(MLmetrics)
# 1) Read the store data; derive the binary target (success = goal
#    attainment above 91%) and the log-transformed distance feature.
df <- read.csv("oxxo_tiendas_ext.csv", stringsAsFactors = FALSE) %>%
  mutate(
    exitoso = if_else(porcentaje_cumplimiento > 91, "Yes", "No"),
    exitoso = factor(exitoso, levels = c("No","Yes")),  # "Yes" = positive class
    log_dist_cerca = log1p(dist_cerca)
  )
# 2) Read 7-Eleven locations with valid coordinates; dist_se (Haversine
#    distance to the nearest 7-Eleven) is computed in the next chunk.
sev <- read.csv("7eleven_mty_tamps.csv", stringsAsFactors = FALSE) %>%
  filter(!is.na(lat), !is.na(lng))
haversine <- function(lon1, lat1, lon2, lat2) {
  # Great-circle distance in metres between points given in decimal
  # degrees (Haversine formula, Earth radius 6,371 km).
  # Vectorised over all four arguments.
  deg <- pi / 180
  phi1 <- lat1 * deg
  phi2 <- lat2 * deg
  half_dphi <- (lat2 - lat1) * deg / 2
  half_dlambda <- (lon2 - lon1) * deg / 2
  h <- sin(half_dphi)^2 + cos(phi1) * cos(phi2) * sin(half_dlambda)^2
  # pmin() clamps sqrt(h) at 1 so floating-point error cannot push the
  # argument of asin() out of its domain.
  2 * 6371000 * asin(pmin(1, sqrt(h)))
}
# Destination coordinates (all 7-Eleven stores).
coords_sev <- sev %>% select(lng, lat)
# dist_se: distance (m) from each OXXO to its nearest 7-Eleven.
# The original used rowwise()/mutate(), which re-enters dplyr's grouping
# machinery once per row; haversine() is already vectorised over the
# destination vectors, so a plain vapply() over row indices computes the
# same per-row minimum with less overhead.
df <- df %>%
  filter(!is.na(latitud_num), !is.na(longitud_num))
df$dist_se <- vapply(seq_len(nrow(df)), function(i) {
  d <- haversine(
    lon1 = df$longitud_num[i], lat1 = df$latitud_num[i],
    lon2 = coords_sev$lng, lat2 = coords_sev$lat
  )
  round(min(d, na.rm = TRUE), 2)
}, numeric(1))
# 3) TRAIN/TEST split using the partition already stored in the data
train_df <- df %>% filter(dataset == "TRAIN")
test_df <- df %>% filter(dataset == "TEST")
# 4) Target encoding of the two high-cardinality categoricals: each level
#    becomes the share of successful stores for that level, computed on
#    TRAIN only, so no test labels leak into the features.
seg_means <- train_df %>%
  group_by(segmento_maestro_desc) %>%
  summarise(enc_segmento = mean(exitoso == "Yes"), .groups = "drop")
lid_means <- train_df %>%
  group_by(lid_ubicacion_tienda) %>%
  summarise(enc_lid = mean(exitoso == "Yes"), .groups = "drop")
train_df <- train_df %>%
  left_join(seg_means, by = "segmento_maestro_desc") %>%
  left_join(lid_means, by = "lid_ubicacion_tienda")
test_df <- test_df %>%
  left_join(seg_means, by = "segmento_maestro_desc") %>%
  left_join(lid_means, by = "lid_ubicacion_tienda")
# Fallback encoding for levels unseen in TRAIN: the global mean.
# NOTE(review): this averages the per-row encodings, which weights
# frequent levels more than the raw success rate would — confirm intended.
global_seg <- mean(train_df$enc_segmento, na.rm = TRUE)
global_lid <- mean(train_df$enc_lid, na.rm = TRUE)
train_df <- train_df %>%
  mutate(
    enc_segmento = replace_na(enc_segmento, global_seg),
    enc_lid = replace_na(enc_lid, global_lid)
  )
test_df <- test_df %>%
  mutate(
    enc_segmento = replace_na(enc_segmento, global_seg),
    enc_lid = replace_na(enc_lid, global_lid)
  )
# 5) Ordinal socio-economic level: map the A..D labels onto 1..7.
#    match() returns each label's position in `niveles` (NA for labels
#    outside the list), identical to
#    as.integer(factor(x, levels = niveles, ordered = TRUE)).
niveles <- c("A","AB","B","BC","C","CD","D")
train_df <- train_df %>%
  mutate(nivel_socio_ord = match(nivelsocioeconomico_des, niveles))
test_df <- test_df %>%
  mutate(nivel_socio_ord = match(nivelsocioeconomico_des, niveles))
# 6) Drop metadata, leakage-prone columns (the target's source variables),
#    and raw columns already replaced by engineered features.
drops <- c(
  "dataset","tienda_id","plaza_cve","latitud_num","longitud_num",
  "venta_promedio","porcentaje_cumplimiento",
  "nivelsocioeconomico_des","entorno_des",
  "segmento_maestro_desc","lid_ubicacion_tienda",
  "num_escuelas","dist_cerca"
)
train_df <- train_df %>% select(-any_of(drops))
test_df <- test_df %>% select(-any_of(drops))
# 7) Drop zero-variance predictors. vapply() replaces sapply(), whose
#    return type silently changes on empty input; integer(1) matches
#    n_distinct()'s return type.
preds <- setdiff(names(train_df), "exitoso")
consts <- preds[vapply(train_df[preds], n_distinct, integer(1)) == 1]
train_df <- train_df %>% select(-all_of(consts))
test_df <- test_df %>% select(-all_of(consts))
# 8) Preprocessing recipe fitted on TRAIN: SMOTE to balance the classes,
#    one-hot dummies for remaining factors, then centre and scale.
#    NOTE(review): step_smote() is skip = TRUE by default per the themis
#    docs, so bake() on test data should NOT oversample it — confirm.
set.seed(123)
rec <- recipe(exitoso ~ ., data = train_df) %>%
  step_smote(exitoso) %>%
  step_dummy(all_nominal_predictors(), one_hot = TRUE) %>%
  step_center(all_numeric_predictors()) %>%
  step_scale(all_numeric_predictors()) %>%
  prep()
# Processed training data (SMOTEd) and test data (transform only).
train_proc <- bake(rec, new_data = NULL)
test_proc <- bake(rec, new_data = test_df)
# 9) Split processed data into predictors (X) and target (y)
X_train <- train_proc %>% select(-exitoso)
y_train <- train_proc$exitoso
X_test <- test_proc %>% select(-exitoso)
y_test <- test_proc$exitoso
# 9b) Aliases kept for compatibility with the model chunks below;
#     the recipe has already centred/scaled, so no further scaling here.
X_train_scaled <- X_train
X_test_scaled <- X_test
pos_lvl <- "Yes"  # positive class label used by every model chunk
# Shared caret control: 5-fold CV keeping class probabilities so each
# model's decision threshold can be tuned afterwards.
ctrl_acc <- trainControl(
  method = "cv",
  number = 5,
  classProbs = TRUE,
  summaryFunction = defaultSummary,
  savePredictions = "final"
)
# <h2 style="color:gold;">Logistic Regression con Elastic Net (glmnet)</h2>
library(glmnet)
library(caret)
# (Removed unused X_mat / y_bin: caret::train() takes the data frame and
#  factor directly, so the manual glmnet matrix was dead code.)
# 5-fold CV control, keeping class probabilities for threshold tuning.
ctrl <- trainControl(
  method = "cv",
  number = 5,
  classProbs = TRUE,
  summaryFunction = defaultSummary
)
# Train elastic net over an alpha x lambda grid, selected by Accuracy.
set.seed(123)
grid <- expand.grid(
  alpha = seq(0, 1, length = 5),
  lambda = 10^seq(-4, 0, length = 20)
)
model_glmnet <- train(
  x = X_train,
  y = y_train,
  method = "glmnet",
  metric = "Accuracy",
  trControl = ctrl,
  tuneGrid = grid
)
# Evaluate on TEST.
# NOTE(review): the decision threshold is tuned against TEST labels,
# which leaks test information into the reported metrics; tune it on
# train/CV predictions instead (as the XGBoost chunk below does).
probs_enet <- predict(model_glmnet, X_test, type = "prob")[, "Yes"]
ths_enet <- seq(0.1, 0.9, 0.01)  # computed once, not twice as before
accs_enet <- sapply(ths_enet, function(t)
  mean(factor(ifelse(probs_enet > t, "Yes", "No"),
              levels = c("No","Yes")) == y_test))
best_t <- ths_enet[which.max(accs_enet)]
pred_enet <- factor(ifelse(probs_enet > best_t, "Yes", "No"), levels = c("No","Yes"))
cm_enet <- confusionMatrix(pred_enet, y_test, positive = "Yes")
# Metrics
cat("Elastic Net (α=",model_glmnet$bestTune$alpha,
", λ=",model_glmnet$bestTune$lambda,")\n", sep="")
## Elastic Net (α=0, λ=0.05455595)
cat("Best threshold:", best_t, "\n")
## Best threshold: 0.36
cat("Accuracy: ", round(cm_enet$overall["Accuracy"], 3), "\n")
## Accuracy: 0.781
cat("Precision: ", round(cm_enet$byClass["Pos Pred Value"], 3), "\n")
## Precision: 0.781
cat("Sensitivity:", round(cm_enet$byClass["Sensitivity"], 3), "\n")
## Sensitivity: 0.974
cat("F1 Score: ", round(F1_Score(y_test, pred_enet, positive="Yes"), 3), "\n")
## F1 Score: 0.867
# 1) Train KNN, selecting k over 5 candidates by CV Accuracy
model_knn_acc <- train(
  x = X_train_scaled,
  y = y_train,
  method = "knn",
  metric = "Accuracy",
  trControl = ctrl_acc,
  tuneLength = 5
)
# 2) Predict "Yes" probabilities on the test set
probs_knn <- predict(model_knn_acc, X_test_scaled, type = "prob")[, pos_lvl]
# 3) Grid-search the probability cutoff that maximises Accuracy.
# NOTE(review): the cutoff is tuned against TEST labels, so the metrics
# below are optimistically biased — tune on train/CV predictions instead.
ths_knn <- seq(0.1, 0.9, by = 0.01)
accs_knn <- sapply(ths_knn, function(t) {
  preds_t <- factor(ifelse(probs_knn > t, pos_lvl, "No"),
                    levels = c("No","Yes"))
  mean(preds_t == y_test)
})
best_t_knn <- ths_knn[which.max(accs_knn)]
cat("KNN best threshold (Accuracy):", best_t_knn, "\n\n")
## KNN best threshold (Accuracy): 0.1
# 4) Final evaluation at the chosen cutoff
pred_knn <- factor(ifelse(probs_knn > best_t_knn, pos_lvl, "No"),
                   levels = c("No","Yes"))
cm_knn <- confusionMatrix(pred_knn, y_test, positive = pos_lvl)
# 5) Metrics (Accuracy first)
cat("Accuracy: ", round(cm_knn$overall["Accuracy"], 3), "\n")
## Accuracy: 0.724
cat("Precision: ", round(cm_knn$byClass["Pos Pred Value"], 3), "\n")
## Precision: 0.745
cat("Sensitivity:", round(cm_knn$byClass["Sensitivity"], 3), "\n")
## Sensitivity: 0.948
cat("F1 Score: ", round(F1_Score(y_test, pred_knn, positive = pos_lvl), 3), "\n")
## F1 Score: 0.834
# 1) Fit a CART decision tree, tuning cp over 5 candidates by CV Accuracy
model_tree_acc <- train(
  x = X_train_scaled,
  y = y_train,
  method = "rpart",
  metric = "Accuracy",
  trControl = ctrl_acc,
  tuneLength = 5
)
# 2) Class-"Yes" probabilities on the test set
probs_tree <- predict(model_tree_acc, X_test_scaled, type = "prob")[, pos_lvl]
# 3) Grid-search the probability cutoff that maximises test Accuracy
acc_at_cutoff <- function(cutoff) {
  labels_at <- factor(ifelse(probs_tree > cutoff, pos_lvl, "No"),
                      levels = c("No","Yes"))
  mean(labels_at == y_test)
}
ths_tree <- seq(0.1, 0.9, by = 0.01)
accs_tree <- vapply(ths_tree, acc_at_cutoff, numeric(1))
best_t_tree <- ths_tree[which.max(accs_tree)]
cat("Tree best threshold (Accuracy):", best_t_tree, "\n\n")
## Tree best threshold (Accuracy): 0.1
# 4) Final evaluation at the chosen cutoff
pred_tree <- factor(ifelse(probs_tree > best_t_tree, pos_lvl, "No"),
                    levels = c("No","Yes"))
cm_tree <- confusionMatrix(pred_tree, y_test, positive = pos_lvl)
# 5) Metrics (Accuracy first)
cat("Accuracy: ", round(cm_tree$overall["Accuracy"], 3), "\n")
## Accuracy: 0.733
cat("Precision: ", round(cm_tree$byClass["Pos Pred Value"], 3), "\n")
## Precision: 0.733
cat("Sensitivity:", round(cm_tree$byClass["Sensitivity"], 3), "\n")
## Sensitivity: 1
cat("F1 Score: ", round(F1_Score(y_test, pred_tree, positive = pos_lvl), 3), "\n")
## F1 Score: 0.846
# 1) Train a random forest, tuning mtry over 5 candidates by CV Accuracy
model_rf_acc <- train(
  x = X_train_scaled,
  y = y_train,
  method = "rf",
  metric = "Accuracy",
  trControl = ctrl_acc,
  tuneLength = 5
)
# 2) Tune the probability cutoff.
# NOTE(review): cutoff is chosen against TEST labels (optimistic bias);
# tune on train/CV predictions instead.
probs_rf <- predict(model_rf_acc, X_test_scaled, type="prob")[, pos_lvl]
ths_rf <- seq(0.1,0.9,by=0.01)
accs_rf <- sapply(ths_rf, function(t){
  preds <- factor(ifelse(probs_rf>t,pos_lvl,"No"), levels=c("No","Yes"))
  mean(preds==y_test)
})
best_t_rf <- ths_rf[which.max(accs_rf)]
cat("RF best threshold (Accuracy):", best_t_rf, "\n")
## RF best threshold (Accuracy): 0.16
# 3) Metrics at the chosen cutoff
pred_rf <- factor(ifelse(probs_rf>best_t_rf,pos_lvl,"No"), levels=c("No","Yes"))
cm_rf <- confusionMatrix(pred_rf, y_test, positive=pos_lvl)
cat("Accuracy:", round(cm_rf$overall["Accuracy"],3), "\n")
## Accuracy: 0.743
cat("Precision:", round(cm_rf$byClass["Pos Pred Value"],3), "\n")
## Precision: 0.74
cat("Sensitivity:", round(cm_rf$byClass["Sensitivity"],3), "\n")
## Sensitivity: 1
cat("F1:", round(F1_Score(y_test,pred_rf,positive=pos_lvl),3), "\n")
## F1: 0.851
# 1) Parallel backend: one worker per core, leaving one core free
library(doParallel)
cores <- detectCores() - 1
cl <- makeCluster(cores)
registerDoParallel(cl)
# 2) Training control: 5-fold CV with RANDOM hyperparameter search,
#    running folds/candidates on the parallel workers
library(caret)
ctrl_rand5 <- trainControl(
  method = "cv",
  number = 5,
  search = "random",
  classProbs = TRUE,
  summaryFunction = defaultSummary,
  allowParallel = TRUE
)
# 3) Train XGBoost, sampling 15 random hyperparameter configurations
set.seed(2025)
xgb_rand5 <- train(
  x = X_train_scaled,
  y = y_train,
  method = "xgbTree",
  metric = "Accuracy",
  trControl = ctrl_rand5,
  tuneLength = 15
)
print(xgb_rand5) # optimal parameters
## eXtreme Gradient Boosting
##
## 1428 samples
## 15 predictor
## 2 classes: 'No', 'Yes'
##
## No pre-processing
## Resampling: Cross-Validated (5 fold)
## Summary of sample sizes: 1142, 1143, 1143, 1142, 1142
## Resampling results across tuning parameters:
##
## eta max_depth gamma colsample_bytree min_child_weight
## 0.05884483 6 7.6780792 0.6594054 5
## 0.07951405 5 9.6561064 0.5422108 14
## 0.07982717 3 2.6295775 0.3342711 11
## 0.09234926 1 4.9586521 0.6485793 9
## 0.10843767 3 3.3275529 0.6712665 2
## 0.21180211 5 9.9572537 0.4436867 7
## 0.21571093 3 1.0278958 0.4395477 7
## 0.27287772 1 2.1741177 0.6337370 19
## 0.31261424 2 1.8580264 0.4082290 7
## 0.32493396 9 1.0196797 0.5358437 11
## 0.35449315 4 6.1169863 0.3772096 1
## 0.36086623 2 0.6920193 0.5073561 12
## 0.44186551 8 7.2200757 0.3873159 20
## 0.44968516 7 7.4204021 0.6276336 7
## 0.45926188 10 4.3026438 0.3252454 1
## subsample nrounds Accuracy Kappa
## 0.8555164 510 0.8053392 0.6106494
## 0.8927929 922 0.7885290 0.5770477
## 0.4332607 461 0.7955441 0.5910779
## 0.6650177 373 0.7794136 0.5588062
## 0.3401240 266 0.7822329 0.5644197
## 0.3746437 900 0.7738265 0.5476364
## 0.6116031 972 0.7885241 0.5770161
## 0.3522443 961 0.7808073 0.5615920
## 0.7868055 881 0.7941357 0.5882429
## 0.8013416 279 0.7864188 0.5728110
## 0.7929506 539 0.7920255 0.5840250
## 0.8039660 891 0.7941504 0.5882774
## 0.4021175 187 0.7458226 0.4916174
## 0.9474417 932 0.7955343 0.5910411
## 0.7491381 983 0.7927273 0.5854072
##
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were nrounds = 510, max_depth = 6, eta
## = 0.05884483, gamma = 7.678079, colsample_bytree = 0.6594054,
## min_child_weight = 5 and subsample = 0.8555164.
# 4) Probabilities on train and test
train_probs5 <- predict(xgb_rand5, X_train_scaled, type = "prob")[, "Yes"]
test_probs5 <- predict(xgb_rand5, X_test_scaled, type = "prob")[, "Yes"]
# 5) Tune the cutoff on TRAIN only — unlike the earlier model chunks,
#    this keeps test labels out of the threshold choice.
ths <- seq(0.1, 0.9, by = 0.01)
train_accs5 <- sapply(ths, function(t) {
  preds_t <- factor(ifelse(train_probs5 > t, "Yes", "No"),
                    levels = c("No","Yes"))
  mean(preds_t == y_train)
})
best_t5 <- ths[which.max(train_accs5)]
cat("Umbral óptimo en train (5-fold):", best_t5, "\n")
## Umbral óptimo en train (5-fold): 0.5
# 6) Evaluate on TEST with the train-chosen cutoff
preds5 <- factor(ifelse(test_probs5 > best_t5, "Yes", "No"),
                 levels = c("No","Yes"))
cm5 <- confusionMatrix(preds5, y_test, positive = "Yes")
cat("Accuracy en test (5-fold + umbral entrenado):",
    round(cm5$overall["Accuracy"], 3), "\n")
## Accuracy en test (5-fold + umbral entrenado): 0.743
# 7) Release the parallel workers
stopCluster(cl)
registerDoSEQ()
# 1) Train LightGBM: binary objective, log-loss metric, 100 boosting
#    rounds, otherwise default hyperparameters (no CV, unlike the caret
#    models). Factor levels c("No","Yes") map to labels 0/1.
dtrain <- lgb.Dataset(as.matrix(X_train_scaled), label = as.numeric(y_train)-1)
params <- list(objective="binary", metric="binary_logloss")
model_lgb_acc <- lgb.train(params, dtrain, nrounds=100, verbose=-1)
# 2) Tune the probability cutoff.
# NOTE(review): cutoff is chosen against TEST labels (optimistic bias);
# tune on train/CV predictions instead.
probs_lgb <- predict(model_lgb_acc, as.matrix(X_test_scaled))
ths_lgb <- seq(0.1,0.9,by=0.01)
accs_lgb <- sapply(ths_lgb, function(t){
  preds <- factor(ifelse(probs_lgb>t,pos_lvl,"No"), levels=c("No","Yes"))
  mean(preds==y_test)
})
best_t_lgb <- ths_lgb[which.max(accs_lgb)]
cat("LGBM best threshold (Accuracy):", best_t_lgb, "\n")
## LGBM best threshold (Accuracy): 0.41
# 3) Metrics at the chosen cutoff
pred_lgb <- factor(ifelse(probs_lgb>best_t_lgb,pos_lvl,"No"), levels=c("No","Yes"))
cm_lgb <- confusionMatrix(pred_lgb, y_test, positive=pos_lvl)
cat("Accuracy:", round(cm_lgb$overall["Accuracy"],3), "\n")
## Accuracy: 0.771
cat("Precision:", round(cm_lgb$byClass["Pos Pred Value"],3), "\n")
## Precision: 0.791
cat("Sensitivity:", round(cm_lgb$byClass["Sensitivity"],3), "\n")
## Sensitivity: 0.935
cat("F1:", round(F1_Score(y_test,pred_lgb,positive=pos_lvl),3), "\n")
## F1: 0.857
# 1) Train a single-hidden-layer MLP (nnet), tuning size/decay by CV Accuracy
model_nnet_acc <- train(
  x = X_train_scaled,
  y = y_train,
  method = "nnet",
  metric = "Accuracy",
  trControl = ctrl_acc,
  tuneLength = 3,
  trace = FALSE,  # silence per-iteration optimizer output
  maxit = 1000    # raise the iteration cap so optimisation can converge
)
# 2) Tune the probability cutoff.
# NOTE(review): cutoff is chosen against TEST labels (optimistic bias);
# tune on train/CV predictions instead.
probs_nnet <- predict(model_nnet_acc, X_test_scaled, type="prob")[, pos_lvl]
ths_nnet <- seq(0.1,0.9,by=0.01)
accs_nnet <- sapply(ths_nnet, function(t){
  preds <- factor(ifelse(probs_nnet>t,pos_lvl,"No"), levels=c("No","Yes"))
  mean(preds==y_test)
})
best_t_nnet <- ths_nnet[which.max(accs_nnet)]
cat("MLP best threshold (Accuracy):", best_t_nnet, "\n")
## MLP best threshold (Accuracy): 0.17
# 3) Metrics at the chosen cutoff
pred_nnet <- factor(ifelse(probs_nnet>best_t_nnet,pos_lvl,"No"), levels=c("No","Yes"))
cm_nnet <- confusionMatrix(pred_nnet, y_test, positive=pos_lvl)
cat("Accuracy:", round(cm_nnet$overall["Accuracy"],3), "\n")
## Accuracy: 0.79
cat("Precision:", round(cm_nnet$byClass["Pos Pred Value"],3), "\n")
## Precision: 0.778
cat("Sensitivity:", round(cm_nnet$byClass["Sensitivity"],3), "\n")
## Sensitivity: 1
cat("F1:", round(F1_Score(y_test,pred_nnet,positive=pos_lvl),3), "\n")
## F1: 0.875
```